library(data.table)
library(tidyverse)
library(lattice)
library(gridExtra)
library(rtracklayer)
library(DiffBind)
library(idr2d)
library(patchwork)
library(reactable)
# set your working directory
setwd("~/Documents/HGEN_663/extra/lec10")
We read in MACS peak calls and select coordinate columns as well as the p-value column, then run IDR
idr <- c('KO','KO2') %>%
paste0('_H3K27ac_peaks.narrowPeak') %>%
lapply(fread, select = c(1:3,8)) %>%
{estimate_idr1d(.[[1]], .[[2]], value_transformation = 'identity')} %>%
.$rep1_df
ggplot(idr, aes(x = rank, y = rep_rank, color = idr)) +
geom_point(size = .1) + scale_color_gradientn(colors = rainbow(10)) +
facet_grid(.~'rank') + theme(legend.position = 'none') +
ggplot(idr, aes(x = value, y = rep_value, color = idr)) +
geom_point(size = .1) + scale_color_gradientn(colors = rainbow(10)) +
facet_grid(.~'value')
We can take the called peaks and merge them to obtain a reference set of genomic intervals – into which we can count reads into just as we’ve done with genes. This can be done easily with DiffBind using a sample sheet containing the locations and BAM / peak files as well as some metadata.
# You don't need to run this chunk in class
dat <- dba(sampleSheet = "samples.csv")
dat <- dba.count(dat)
But since the counting process takes a bit of time, we’ll start with the pre-computed object (lec10.RData). As with transcriptomic and methylomic matrices, we can again perform routine analyses such as PCA and hierchical clustering
load('lec10.RData')
dba.plotPCA(dat, label = DBA_CONDITION)
dba.plotHeatmap(dat, correlations = T)
When we have many regions of interest across the genome, we can assess the distribution of signals surrounding such areas and evaluate the overall trend. One tool for performing this is deepTools, and we can examine the results of their computeMatrix module.
# You don't need to run this chunk in class
library(parallel)
mats <- mclapply(list.files('mats', full.names = T), function(x)
colMeans(fread(x, skip = 1, header = F, drop = 1:6),
na.rm = T), mc.cores = 12) %>%
setNames(list.files('mats'))
mat.ex <- fread('mats/KO_H3K27ac.rx.promoter.mat.gz', skip = 1, header = F)
We’ll first take the KO sample’s H3K27ac enrichment surrounding promoters as an example
# use only the matrix part, throw away the coordinates info
dat.ex <- mat.ex[,-(1:6)] %>%
arrange(rowMeans(.)) %>%
as.matrix
# plot
levelplot(t(dat.ex), useRaster = T, xlab = "Position", ylab = "Interval",
scales = list(draw = F), col.regions = clrs,
at = seq(-2, 3, length.out = 100), aspect = "fill")
Given the link between promoter acetylation and the cognate gene’s expression, we further incorporate RNA-seq data. Again, for the sake of time, it’s been imported and provided in lec10.RData
# You don’t need to run this chunk in class
exps <- fread('BT245-KO-C4.UCSC.featureCounts.txt.counts')
library(biomaRt)
ensembl <- useMart(biomart = "ENSEMBL_MART_ENSEMBL",
host = "grch37.ensembl.org",
path = "/biomart/martservice",
dataset = "hsapiens_gene_ensembl")
res <- getBM(attributes = c("external_gene_name", "chromosome_name",
"start_position", "end_position"),
filters = "external_gene_name", values = exps$Geneid,
mart = ensembl)
dict <- left_join(res[!duplicated(res$external_gene_name) &
res$external_gene_name %in% exps$Geneid,],
exps[exps$Geneid %in% res$external_gene_name],
by = c("external_gene_name" = "Geneid")) %>%
filter(chromosome_name %in% c(1:22, "X", "Y"))
With the expresion data in hand, we now combine the two modalities
# get promoters intervals from matrix
pmts <- makeGRangesFromDataFrame(mat.ex,
seqnames.field = 'V1',
start.field = 'V2',
end.field = 'V3')
# get gene intervals overlapping promoters
dict$chr <- paste0("chr", dict$chromosome_name)
genes <- makeGRangesFromDataFrame(dict,
start.field = "start_position",
end.field = "end_position",
seqnames.field = "chr",
keep.extra.columns = T)
pmts.ol <- pmts[overlapsAny(pmts, genes)]
genes.ol <- genes[overlapsAny(genes, pmts)]
# partition genes into quintiles by expression
ranks <- rank(genes.ol$counts/abs((end(genes.ol) - start(genes.ol))),
ties.method = "first")
genes.ol$cut <- as.numeric(cut(ranks, quantile(ranks, probs = seq(0, 1, 0.2)),
include.lowest = TRUE))
pmts.cut <- data.frame(V4 = sprintf("%s:%d-%d", seqnames(pmts.ol),
start(pmts.ol),end(pmts.ol)),
cut = genes.ol$cut[findOverlaps(pmts.ol, genes.ol,
select = "first")])
# integrate
dat.ex <- left_join(pmts.cut, mat.ex[,-c(1:3, 5, 6)]) %>%
select(-V4) %>%
arrange(cut, rowMeans(select(., -cut))) %>%
split(., .$cut)
We can plot the results as heatmaps again
# plot heatmap for each quintile
lapply(1:5, function(i) {
levelplot(t(dat.ex[[i]][,2:601]), useRaster = T,
xlab = NULL, ylab = NULL, scales = list(draw = F),
col.regions = clrs, aspect = "fill",
at = seq(-2, 3, length.out = 100),
colorkey = F, margin = F)
}) %>% grid.arrange(grobs = ., ncol = 1)
Or alternatively take the column-wise average across said heatmap as a summary
## Warning in melt(.): The melt generic in data.table has been passed a list and
## will attempt to redirect to the relevant reshape2 method; please note that
## reshape2 is deprecated, and this redirection is now deprecated as well. To
## continue using melt methods from reshape2 while both libraries are attached,
## e.g. melt.list, you can prepend the namespace like reshape2::melt(.). In the
## next version, this warning will become an error.
Or more directly we can just assess the relationship per-promoter
## Joining with `by = join_by(V4)`
We’ve been only looking at one sample thus far, and to contrast difference samples we’ll need to make sure the scales are meaningfully comparable
Taking our previous promoter aggregate plots, different methods would point to rather divergent conclusions
# define groups
ann <- tibble(f = names(mats)) %>%
separate(f, c('samps', 'marks', 'norms', 'regs', NA, NA), '[_\\.]', F)
# what to plot
mark <- 'H3K27ac'
reg <- 'promoter'
# label groups
dats <- melt(mats[ann$f[ann$marks == mark & ann$regs == reg]]) %>%
rownames_to_column("row") %>%
mutate(row = (as.numeric(row) - 1) %% (n()/6) + 1) %>%
merge(dplyr::rename(ann, L1 = f)) %>%
mutate(norms = factor(norms, levels = c('raw', 'inp', 'rx'))) %>%
select(-L1)
# label axis
len <- nrow(dats) / 6 / 3
if (len * 10 < 1000) {
lab.l <- sprintf("-%db", len)
lab.r <- sprintf("+%db", len)
} else {
lab.l <- sprintf("-%dkb", len / 100)
lab.r <- sprintf("+%dkb", len / 100)
}
# plot
ggplot(dats, aes(x = row, y = value, color = samps)) +
annotate("rect", xmin = len, xmax = len * 2, ymin = -Inf,
ymax = Inf, alpha = 0.2, fill = "grey") +
geom_line() +
labs(y = "Enrichment", title = sprintf("%s @ %s", mark, reg)) +
scale_x_continuous(breaks = c(1, len * 1:3),
labels = c(lab.l, "Start", "End", lab.r),
expand = c(0, 0), name = "Position") +
facet_wrap(. ~ norms, scales = "free") + theme_bw()
As opposed to limiting ourselves to some set of pre-defined regions, we could also just take the counts in uniformly divided windows across the genome, which is stored cts object (a list of bedGraph files imported like below)
# You don’t need to run this chunk in class
cts <- lapply(list.files('10kb', full.names = T), import.bedGraph) %>%
setNames(sub('.10kb.bed', '', list.files('10kb')))
We now apply various normalization factors
# read in scaling factors
facs <- fread('~/Documents/HGEN_663/extra/lec10/rx.csv')
facs <- facs %>%
mutate(sample = paste(facs$condition, facs$mark, sep = "_")) %>%
rowwise() %>%
mutate(ref = min(hs_chip, hs_input)) %>%
ungroup() %>%
mutate(depthfac1 = ref / hs_chip,
depthfac2 = ref / hs_input,
rx = (hs_chip / hs_input) / (dm_chip / dm_input),
scalefac1 = rx * depthfac1)
facs %>% reactable()
# normalize
for (samp in facs$sample) {
# get ratios
rx <- facs$scalefac1[facs$sample == samp]
r1 <- facs$depthfac1[facs$sample == samp]
r2 <- facs$depthfac2[facs$sample == samp]
# scale by factor
num.input <- r1 * cts[[samp]]$score + 1
num.rx <- rx * cts[[samp]]$score + 1
denom <- r2 * cts[[sub('_.*', '_input', samp)]]$score + 1
cts[[samp]]$rx <- log2(num.rx / denom)
cts[[samp]]$input <- log2(num.input / denom)
}
And take H3K27me3 as an example
# compare counts
mark <- 'H3K27me3'
samp1 <- paste('KO', mark, sep = '_')
samp2 <- paste('K27M', mark, sep = '_')
smoothScatter(x = log2(cts[[samp1]]$score + 1),
y = log2(cts[[samp2]]$score + 1),
xlab = samp1, ylab = samp2, main = "Raw")
abline(0,1)
Or as MA plots
ma <- list()
ma[["raw"]] <- data.frame(m = log(cts[[samp1]]$score) - log(cts[[samp2]]$score),
a = 0.5 * (log(cts[[samp1]]$score) + log(cts[[samp2]]$score)))
ma[["input"]] <- data.frame(m = cts[[samp1]]$input - cts[[samp2]]$input,
a = 0.5 * (cts[[samp1]]$input + cts[[samp2]]$input))
ma[["rx"]] <- data.frame(m = cts[[samp1]]$rx - cts[[samp2]]$rx,
a = 0.5 * (cts[[samp1]]$rx + cts[[samp2]]$rx))
par(mfrow = c(1,3))
for(nm in names(ma)) {
smoothScatter(x = ma[[nm]]$a, y = ma[[nm]]$m, ylab = "M", xlab = "A",
main = sprintf("%s - %s (%s)", samp1, samp2, nm))
abline(h = 0)
}
# define groups
ann <- tibble(f = names(mats)) %>%
separate(f, c('samps', 'marks', 'norms', 'regs', NA, NA), '[_\\.]', F)
# check what other regulatory regions are avaiable
ann$regs
# check what other marks you can plot
ann$marks
# what to plot
mark <- ''
reg <- ''
# label groups
dats <-
# label axis
len <- nrow(dats) / 6 / 3
if (len * 10 < 1000) {
lab.l <- sprintf("-%db", len)
lab.r <- sprintf("+%db", len)
} else {
lab.l <- sprintf("-%dkb", len / 100)
lab.r <- sprintf("+%dkb", len / 100)
}
# plot
ggplot()
Other marks: H3K36me2, H3K27ac
# compare counts
mark <- ''
samp1 <- paste('KO', mark, sep = '_')
samp2 <- paste('K27M', mark, sep = '_')
smoothScatter(x = log2(cts[[samp1]]$score + 1),
y = log2(cts[[samp2]]$score + 1),
xlab = samp1, ylab = samp2, main = "Raw")
abline(0,1)
#
ma <- list()
ma[["raw"]] <- data.frame(m = log(cts[[samp1]]$score) - log(cts[[samp2]]$score),
a = 0.5 * (log(cts[[samp1]]$score) + log(cts[[samp2]]$score)))
ma[["input"]] <- data.frame(m = cts[[samp1]]$input - cts[[samp2]]$input,
a = 0.5 * (cts[[samp1]]$input + cts[[samp2]]$input))
ma[["rx"]] <- data.frame(m = cts[[samp1]]$rx - cts[[samp2]]$rx,
a = 0.5 * (cts[[samp1]]$rx + cts[[samp2]]$rx))
par(mfrow = c(1,3))
for(nm in names(ma)) {
smoothScatter(x = ma[[nm]]$a, y = ma[[nm]]$m, ylab = "M", xlab = "A",
main = sprintf("%s - %s (%s)", samp1, samp2, nm))
abline(h = 0)
}